{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 线上兼职\n",
    "\n",
    "### 背景\n",
    "* 随着经济的快速发展、生活水平的不断提高和物价的持续上涨，很多人感到自己的工资不足以让自己轻松地生活，于是很多白领、宝妈、学生等都会在空余时间找兼职，其中轻松且来钱快的线上兼职尤为受欢迎，由此诞生了挖掘线上兼职公众号的想法。\n",
    "### 数据加值宣言  \n",
    "* 本项目产出按公众号文章标题、链接以及时间挖掘的关于线上兼职的数据，以解决白领、宝妈、学生等人群闲暇时期对线上兼职的需求问题。\n",
    "### 数据最小可用产品\n",
    "* MVP的数据加值：数据产品的数据类型包括微信公众号《线上任务》里文章的标题、链接以及时间；最后还导出了相关的公众号名称与链接，供用户查询。  \n",
    "  对微信公众号《线上任务》的基本信息数据挖掘，可帮助用户更快地筛选自己想要做的兼职，并点开链接查看相关内容。  \n",
    "  最后所导出的相关的公众号名称与链接，为用户提供了更多的选择。\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 准备工作"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "'''\n",
    "阿里研究院\n",
    "阿里健康\n",
    "阿里巴巴商学院\n",
    "阿里数据\n",
    "\n",
    "腾讯金融科技\n",
    "腾讯研究院\n",
    "腾讯媒体研究院\n",
    "腾讯云启研究院\n",
    "酷鹅用户研究院\n",
    "'''\n",
    "公众号 = \"线上任务\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "fn = { \"output\" : { \"公众号_htm_snippets\": \"data_raw_src/公众号_htm_snippets_{公众号}.tsv\",\n",
    "                    \"公众号_df\": \"data_raw_src/公众号_df_{公众号}.tsv\",\n",
    "                    \"公众号_xlsx\": \"data_sets/公众号_url_{公众号}.xlsx\" } \\\n",
    "      }"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 采集公众号（requests）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0\n",
      "5\n",
      "10\n",
      "15\n",
      "20\n",
      "25\n",
      "30\n",
      "35\n",
      "40\n",
      "45\n",
      "50\n",
      "55\n",
      "60\n",
      "65\n",
      "70\n",
      "75\n",
      "80\n",
      "85\n",
      "90\n",
      "95\n",
      "100\n",
      "105\n",
      "110\n",
      "115\n",
      "120\n",
      "125\n",
      "130\n",
      "135\n",
      "140\n",
      "145\n",
      "150\n",
      "155\n",
      "160\n",
      "165\n",
      "170\n",
      "175\n",
      "180\n",
      "185\n",
      "190\n",
      "195\n",
      "200\n",
      "205\n",
      "210\n",
      "215\n",
      "220\n",
      "225\n",
      "230\n",
      "235\n",
      "240\n",
      "245\n",
      "250\n",
      "255\n",
      "260\n",
      "265\n",
      "270\n",
      "275\n"
     ]
    }
   ],
   "source": [
    "# Collect title / link / create_time of the account's articles through the\n",
    "# WeChat MP backend \"appmsg\" endpoint, then export the result to Excel.\n",
    "\n",
    "import os\n",
    "import time\n",
    "import requests\n",
    "import pandas as pd\n",
    "import csv\n",
    "\n",
    "\n",
    "# Target URL\n",
    "url = \"https://mp.weixin.qq.com/cgi-bin/appmsg\"\n",
    "\n",
    "# A logged-in session cookie is sent so the login flow can be skipped.\n",
    "# The cookie and the token are secrets: they are read from the environment\n",
    "# variables WECHAT_MP_COOKIE / WECHAT_MP_TOKEN instead of being committed\n",
    "# with the notebook (a KeyError here means the variable is not set).\n",
    "headers = {\n",
    "  \"Cookie\": os.environ[\"WECHAT_MP_COOKIE\"],\n",
    "  \"User-Agent\": \"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36\"}\n",
    "\n",
    "PAGE_SIZE = 5    # articles requested per call (sent as \"count\")\n",
    "MAX_PAGES = 56   # upper bound on the number of calls\n",
    "\n",
    "data = {\n",
    "    \"token\": os.environ[\"WECHAT_MP_TOKEN\"],\n",
    "    \"lang\": \"zh_CN\",\n",
    "    \"f\": \"json\",\n",
    "    \"ajax\": \"1\",\n",
    "    \"action\": \"list_ex\",\n",
    "    \"begin\": \"0\",\n",
    "    \"count\": str(PAGE_SIZE),\n",
    "    \"query\": \"\",\n",
    "    \"fakeid\": \"MzUzNTkxOTQ2Nw==\",\n",
    "    \"type\": \"9\",\n",
    "}\n",
    "\n",
    "content_list=[]\n",
    "\n",
    "for i in range(MAX_PAGES):\n",
    "    data[\"begin\"] = i*PAGE_SIZE\n",
    "    print(data[\"begin\"])\n",
    "    time.sleep(3)  # pause between calls\n",
    "    # Submit with GET; the response is JSON holding one page of articles.\n",
    "    content_json = requests.get(url, headers=headers, params=data, timeout=30).json()\n",
    "    # Without \"app_msg_list\" there is nothing to read on this page. Stop here\n",
    "    # and keep what was collected instead of raising KeyError before the export.\n",
    "    if \"app_msg_list\" not in content_json:\n",
    "        print(\"stopped at begin =\", data[\"begin\"], \"response:\", content_json.get(\"base_resp\"))\n",
    "        break\n",
    "    # Keep the title, URL and creation time of every article on the page.\n",
    "    for item in content_json[\"app_msg_list\"]:\n",
    "        content_list.append([item[\"title\"], item[\"link\"], item[\"create_time\"]])\n",
    "\n",
    "\n",
    "name=['title','link','create_time']\n",
    "test=pd.DataFrame(columns=name,data=content_list)\n",
    "with pd.ExcelWriter(fn[\"output\"][\"公众号_xlsx\"].format(公众号=\"线上任务全部数据\")) as writer:\n",
    "    test.to_excel(writer)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 采集公众号（selenium）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 142,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from lxml.html import fromstring\n",
    "import time\n",
    "from random import random\n",
    "\n",
    "# when selenium main_content is used\n",
    "# Parses an HTML document from a string constant.  Returns the root node\n",
    "# root = fromstring(df.loc[1,\"html_snippets\"]) "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 使用Selenium\n",
    "* 要更改 opts.binary_location 至自己本地的Chrome浏览器，建议portable\n",
    "* Chrome浏览器 和 chromedriver.exe要同版本号到小数后一位\n",
    "* 要确保可以 开启浏览器机器人\n",
    "* 要确保浏览器机器人 可以打开网页 driver.get(\"https://mp.weixin.qq.com\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 143,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Users\\Administrator\\anaconda3\\lib\\site-packages\\ipykernel_launcher.py:18: DeprecationWarning: use options instead of chrome_options\n"
     ]
    }
   ],
   "source": [
    "from selenium import webdriver\n",
    "from selenium.webdriver.common.desired_capabilities import DesiredCapabilities\n",
    "\n",
    "#caps=dict()\n",
    "#caps[\"pageLoadStrategy\"] = \"none\"   # Do not wait for full page load\n",
    "\n",
    "# Options for the Chrome instance driven by Selenium.\n",
    "opts = webdriver.ChromeOptions()\n",
    "opts.add_argument('--no-sandbox')  # works around the \"DevToolsActivePort file doesn't exist\" error\n",
    "opts.add_argument('--window-size=1920,3000')  # browser window size; Chrome expects \"width,height\"\n",
    "opts.add_argument('--disable-gpu')  # the Chrome docs mention this flag as a workaround for a bug\n",
    "opts.add_argument('--hide-scrollbars')  # hide scrollbars, for some special pages\n",
    "#opts.add_argument('blink-settings=imagesEnabled=false')  # do not load images, for speed\n",
    "#opts.add_argument('--headless')  # no visible window; needed on Linux hosts without a display\n",
    "\n",
    "# Path of the local Chrome binary -- change it to match your own installation.\n",
    "opts.binary_location = r\"C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe\"\n",
    "\n",
    "# `options=` replaces the deprecated `chrome_options=` keyword.\n",
    "driver = webdriver.Chrome(options=opts)  # desired_capabilities=caps,"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 144,
   "metadata": {},
   "outputs": [],
   "source": [
    "driver.get(\"https://mp.weixin.qq.com\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 填表登入"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "selenium 的定位方法\n",
    "* find_element_by_id &ensp;&ensp;&ensp;  根据标签id定位\n",
    "* find_element_by_name   &ensp;&ensp;&ensp; 根据标签的name定位\n",
    "* find_element_by_xpath  &ensp;&ensp;&ensp; 根据xpath定位\n",
    "* find_element_by_link_text  &ensp;&ensp;&ensp; 通过文字链接来定位元素\n",
    "* find_element_by_partial_link_text  &ensp;&ensp;&ensp;  通过部分文字链接来定位元素\n",
    "* find_element_by_tag_name  &ensp;&ensp;&ensp;  根据标签的名字定位\n",
    "* find_element_by_class_name  &ensp;&ensp;&ensp; 通过class name 定位\n",
    "* find_element_by_css_selector  &ensp;&ensp;&ensp;  通过CSS选择器定位"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 145,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "from getpass import getpass\n",
    "\n",
    "# Login credentials are taken from the environment variables\n",
    "# WECHAT_MP_ACCOUNT / WECHAT_MP_PASSWORD, or prompted for when unset,\n",
    "# so that they are never stored in the notebook.\n",
    "payload = {\n",
    "    \"account\": os.environ.get(\"WECHAT_MP_ACCOUNT\") or input(\"account: \"),\n",
    "    \"password\": os.environ.get(\"WECHAT_MP_PASSWORD\") or getpass(\"password: \"),\n",
    "}\n",
    "# Click the link inside the scan-login container (presumably switches to the\n",
    "# account/password form that the next cells fill in -- confirm on the live page).\n",
    "driver.find_element_by_xpath('//div[@class=\"login__type__container login__type__container__scan\"]/a').click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "WebDriver 常用方法：\n",
    "* clear() 清除文本\n",
    "* send_keys(values) 模拟按键输入\n",
    "* click() 模拟点击\n",
    "* submit() 模拟提交"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 146,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fill in the login form: empty each input, then type the matching payload value.\n",
    "for field in (\"account\", \"password\"):\n",
    "    field_xpath = '//form[@class=\"login_form\"]//input[@name=\"%s\"]' % field\n",
    "    driver.find_element_by_xpath(field_xpath).clear()\n",
    "    driver.find_element_by_xpath(field_xpath).send_keys(payload[field])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 147,
   "metadata": {},
   "outputs": [],
   "source": [
    "driver.find_element_by_xpath('//div[@class=\"login_btn_panel\"]/a').click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 点选单"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "其他常用方法\n",
    "* size：返回元素的尺寸\n",
    "* text：获取元素的文本\n",
    "* get_attribute：获取属性值  &ensp;&ensp;&ensp; get_attribute('innerHTML')获取元素内的全部HTML\n",
    "* is_displayed()：返回该元素是否对用户可见"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 148,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'展开'"
      ]
     },
     "execution_count": 148,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "element = driver.find_element_by_xpath('//a[@id=\"m_open\"]')\n",
    "element.click()\n",
    "main_content = element.get_attribute('innerHTML')\n",
    "main_content"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 149,
   "metadata": {},
   "outputs": [],
   "source": [
    "driver.execute_script(\"window.scrollTo(0,document.body.scrollHeight)\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 150,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'https://mp.weixin.qq.com/cgi-bin/appmsg?begin=0&count=10&t=media/appmsg_list&type=10&action=list&token=12898601&lang=zh_CN'"
      ]
     },
     "execution_count": 150,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "element = driver.find_element_by_xpath('//li[@title[contains(.,\"素材管理\")]]/a') \n",
    "# main_content = element.get_attribute('innerHTML')\n",
    "# main_content\n",
    "url2= element.get_attribute(\"href\")\n",
    "url2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 151,
   "metadata": {},
   "outputs": [],
   "source": [
    "driver.get(url2)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 新建图文消息"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 152,
   "metadata": {},
   "outputs": [],
   "source": [
    "element = driver.find_element_by_xpath('//*[text()[contains(.,\"新建图文消息\")]]') \n",
    "main_content = element.get_attribute('innerHTML')\n",
    "main_content\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 153,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['CDwindow-521A7904F87882D4D59C09E8EA96B050', 'CDwindow-6B35375149FE490CDE635BB1B7CB2811']\n"
     ]
    }
   ],
   "source": [
    "print (driver.window_handles)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 154,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 新建图文消息开了另一分视窗，所以要切换 switch_to \n",
    "driver.switch_to.window(driver.window_handles[-1])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 超链接"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 155,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                超链接              \n"
     ]
    }
   ],
   "source": [
    "element = driver.find_element_by_xpath('//*[text()[contains(.,\"超链接\")]]') \n",
    "main_content = element.get_attribute('innerHTML')\n",
    "print(main_content)\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 156,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "选择其他公众号\n"
     ]
    }
   ],
   "source": [
    "# 点 选择其他公众号\n",
    "element = driver.find_element_by_xpath('//*[text()[contains(.,\"选择其他公众号\")]]') \n",
    "main_content = element.get_attribute('innerHTML')\n",
    "print(main_content)\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 157,
   "metadata": {},
   "outputs": [],
   "source": [
    "driver.find_element_by_xpath('//form//div[@class=\"inner_link_account_area\"]//input[@class=\"weui-desktop-form__input\"]').clear()\n",
    "driver.find_element_by_xpath('//form//div[@class=\"inner_link_account_area\"]//input[@class=\"weui-desktop-form__input\"]').send_keys(公众号)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 158,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<div class=\"weui-desktop-icon weui-desktop-icon__inputSearch weui-desktop-icon__small\"><!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <!----> <svg width=\"16\" height=\"16\" viewBox=\"0 0 16 16\" xmlns=\"http://www.w3.org/2000/svg\"><path d=\"M11.33 10.007l4.273 4.273a.502.502 0 0 1 .005.709l-.585.584a.499.499 0 0 1-.709-.004L10.046 11.3a6.278 6.278 0 1 1 1.284-1.294zm.012-3.729a5.063 5.063 0 1 0-10.127 0 5.063 5.063 0 0 0 10.127 0z\"></path></svg> <!----> <!----> <!----> <!----></div>\n"
     ]
    }
   ],
   "source": [
    "# 点放大镜搜\n",
    "element = driver.find_element_by_xpath('//button[@class=\"weui-desktop-icon-btn weui-desktop-search__btn\"]')\n",
    "main_content = element.get_attribute('innerHTML')\n",
    "print(main_content)\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 159,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/dmOmN0bcG0Q9uHF9URNkSkMZ81Co3E5cj4Bb9Zx0psTlm5VM1QSZ3iav9VexOia7sKriciczu5Wjvhu1YRzwicL9Xmw/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">线上任务</strong> <i class=\"inner_link_account_wechat\">微信号：zhangshangrenwu</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/c5ibpQYtTDIG0wCcC5nOWQGVqEKDYBeHmNlxkdVIwCyYuN5FHE4jhb4dyAicTPP3HGToU6RbiaZahvnpewYsq6o7g/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">有赚线上任务</strong> <i class=\"inner_link_account_wechat\">微信号：youzhuanrenwu</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/yjgqVp8oysjgmgK2fDds6L8c1JsVkZdlD0OhtaiaadW1POzJiaX2vfBuGtN8wiarKvtSicZItibhRreW1NCadpC9UWw/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">有赚线上任务日结</strong> <i class=\"inner_link_account_wechat\">微信号：未设置</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/mTAOobASk26fFKerPubYNTwYeR0K7XMKK9HOJ8AWqvqJzribDyu4ibciad0eGia6DIVZkSpEYYFp3sPNJibzslxhnfg/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">人人译</strong> <i class=\"inner_link_account_wechat\">微信号：minitranslate</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">服务号</div></li><li class=\"inner_link_account_item\"><div class=\"weui-desktop-vm_primary\"><img 
src=\"http://mmbiz.qpic.cn/mmbiz_png/3gXic5X4icoHgkMT9MFLyaOcBg61WspR85WTWz8ChnwIdldIBFEYib9rciawkCPBHdNvia1icGXcOAibOxGHBbBoss5Rg/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">橙兼职在线任务</strong> <i class=\"inner_link_account_wechat\">微信号：cjz-renwu</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div></li>\n"
     ]
    }
   ],
   "source": [
    "element = driver.find_element_by_xpath('//ul[@class=\"inner_link_account_list\"]')\n",
    "main_content = element.get_attribute('innerHTML')\n",
    "print(main_content)\n",
    "公众号SERP = main_content\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 160,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 解析\n",
    "root = fromstring(公众号SERP) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 161,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Every <li class=\"inner_link_account_item\"> of the parsed search result is one account.\n",
    "主 = root.xpath('//li[@class=\"inner_link_account_item\"]')\n",
    "\n",
    "# One record per account: display name, WeChat id text and avatar URL.\n",
    "account_list = [\n",
    "    {\n",
    "        \"nickname\": e.xpath('./div/strong[@class=\"inner_link_account_nickname\"]')[0].text,\n",
    "        \"wechat\": e.xpath('./div/i[@class=\"inner_link_account_wechat\"]')[0].text,\n",
    "        \"img\": e.xpath('./div/img/@src')[0],\n",
    "    }\n",
    "    for e in 主\n",
    "]\n",
    "\n",
    "df_account = pd.DataFrame(account_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 162,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>nickname</th>\n",
       "      <th>wechat</th>\n",
       "      <th>img</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>线上任务</td>\n",
       "      <td>微信号：zhangshangrenwu</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/dmOmN0bcG0Q9uHF...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>有赚线上任务</td>\n",
       "      <td>微信号：youzhuanrenwu</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/c5ibpQYtTDIG0wC...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>有赚线上任务日结</td>\n",
       "      <td>微信号：未设置</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/yjgqVp8oysjgmgK...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>人人译</td>\n",
       "      <td>微信号：minitranslate</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/mTAOobASk26fFKe...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>橙兼职在线任务</td>\n",
       "      <td>微信号：cjz-renwu</td>\n",
       "      <td>http://mmbiz.qpic.cn/mmbiz_png/3gXic5X4icoHgkM...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   nickname               wechat  \\\n",
       "0      线上任务  微信号：zhangshangrenwu   \n",
       "1    有赚线上任务    微信号：youzhuanrenwu   \n",
       "2  有赚线上任务日结              微信号：未设置   \n",
       "3       人人译    微信号：minitranslate   \n",
       "4   橙兼职在线任务        微信号：cjz-renwu   \n",
       "\n",
       "                                                 img  \n",
       "0  http://mmbiz.qpic.cn/mmbiz_png/dmOmN0bcG0Q9uHF...  \n",
       "1  http://mmbiz.qpic.cn/mmbiz_png/c5ibpQYtTDIG0wC...  \n",
       "2  http://mmbiz.qpic.cn/mmbiz_png/yjgqVp8oysjgmgK...  \n",
       "3  http://mmbiz.qpic.cn/mmbiz_png/mTAOobASk26fFKe...  \n",
       "4  http://mmbiz.qpic.cn/mmbiz_png/3gXic5X4icoHgkM...  "
      ]
     },
     "execution_count": 162,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_account"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 163,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<div class=\"weui-desktop-vm_primary\"><img src=\"http://mmbiz.qpic.cn/mmbiz_png/dmOmN0bcG0Q9uHF9URNkSkMZ81Co3E5cj4Bb9Zx0psTlm5VM1QSZ3iav9VexOia7sKriciczu5Wjvhu1YRzwicL9Xmw/0?wx_fmt=png\" class=\"inner_link_account_avatar\"> <strong class=\"inner_link_account_nickname\">线上任务</strong> <i class=\"inner_link_account_wechat\">微信号：zhangshangrenwu</i></div> <div class=\"weui-desktop-vm_default inner_link_account_type\">订阅号</div>\n"
     ]
    }
   ],
   "source": [
    "element = driver.find_element_by_xpath('//ul[@class=\"inner_link_account_list\"]/li')\n",
    "main_content = element.get_attribute('innerHTML')\n",
    "print(main_content)\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 164,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'\\n跳转_input = driver.find_element_by_xpath(\\'//span[@class=\"weui-desktop-pagination__form\"]/input\\')\\n跳转_a = driver.find_element_by_xpath(\\'//span[@class=\"weui-desktop-pagination__form\"]/a\\')\\n跳转_input.clear()\\n跳转_input.send_keys(2)\\n跳转_a.click()\\n'"
      ]
     },
     "execution_count": 164,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 跳转testing\n",
    "'''\n",
    "跳转_input = driver.find_element_by_xpath('//span[@class=\"weui-desktop-pagination__form\"]/input')\n",
    "跳转_a = driver.find_element_by_xpath('//span[@class=\"weui-desktop-pagination__form\"]/a')\n",
    "跳转_input.clear()\n",
    "跳转_input.send_keys(2)\n",
    "跳转_a.click()\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 165,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1, 56]\n",
      "False\n"
     ]
    }
   ],
   "source": [
    "# 跳转上限\n",
    "l_e = driver.find_elements_by_xpath('//label[@class=\"weui-desktop-pagination__num\"]')\n",
    "l_e_int  = [int(x.text) for x in l_e] \n",
    "print (l_e_int)\n",
    "print (l_e_int[0]==l_e_int[-1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 166,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56]\n"
     ]
    }
   ],
   "source": [
    "# Visit every page from 1 up to the last number reported by the pager.\n",
    "# (The earlier assignment starting at l_e_int[0] was overwritten right away and is dropped.)\n",
    "pages = list(range(1, l_e_int[-1] + 1))\n",
    "print(pages)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 循环/遍历"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 167,
   "metadata": {},
   "outputs": [],
   "source": [
    "# global variables\n",
    "html_raw = dict()\n",
    "main_content =\"\"\n",
    "element = None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 168,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_pages (pages, min_wait=45, jitter=120):\n",
    "    \"\"\"Jump to each page of the article list and store its raw HTML.\n",
    "\n",
    "    For every page number the pagination jump box is cleared, filled in and\n",
    "    submitted; after a randomised pause the innerHTML of the\n",
    "    ``inner_link_article_list`` container is saved in the module-level\n",
    "    ``html_raw`` dict under that page number.\n",
    "\n",
    "    Args:\n",
    "        pages: iterable of page numbers to visit.\n",
    "        min_wait: minimum number of seconds to sleep after each jump.\n",
    "        jitter: upper bound, in seconds, of the extra random sleep.\n",
    "\n",
    "    Returns:\n",
    "        None. Results are accumulated in ``html_raw``.\n",
    "    \"\"\"\n",
    "    jump_form = '//span[@class=\"weui-desktop-pagination__form\"]'\n",
    "    for p in pages:\n",
    "        print (p,end='\\t')\n",
    "\n",
    "        # Look the controls up again on every iteration, as the original code did.\n",
    "        jump_input = driver.find_element_by_xpath(jump_form + '/input')\n",
    "        jump_link = driver.find_element_by_xpath(jump_form + '/a')\n",
    "        jump_input.clear()\n",
    "        jump_input.send_keys(p)\n",
    "        jump_link.click()\n",
    "\n",
    "        # Randomised pause before reading the list (presumably also keeps the\n",
    "        # request rate low -- confirm the values needed against the site).\n",
    "        time.sleep(min_wait+jitter*random())\n",
    "\n",
    "        article_list = driver.find_element_by_xpath('//div[@class=\"inner_link_article_list\"]')\n",
    "        html_raw[p] = article_list.get_attribute('innerHTML')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 169,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\t2\t3\t4\t5\t6\t7\t8\t9\t10\t11\t12\t13\t14\t15\t16\t17\t18\t19\t20\t21\t22\t23\t24\t25\t26\t27\t28\t29\t30\t31\t32\t33\t34\t35\t36\t37\t38\t39\t40\t41\t42\t43\t44\t45\t46\t47\t48\t49\t50\t51\t52\t53\t54\t55\t56\t"
     ]
    }
   ],
   "source": [
    "process_pages(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 170,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>html_snippets</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>35</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>37</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>39</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>40</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>43</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>44</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>45</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>46</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>47</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>50</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>51</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>52</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>53</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>54</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>55</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>56</th>\n",
       "      <td>&lt;div&gt;&lt;label class=\"inner_link_article_item\"&gt;&lt;s...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                        html_snippets\n",
       "1   <div><label class=\"inner_link_article_item\"><s...\n",
       "2   <div><label class=\"inner_link_article_item\"><s...\n",
       "3   <div><label class=\"inner_link_article_item\"><s...\n",
       "4   <div><label class=\"inner_link_article_item\"><s...\n",
       "5   <div><label class=\"inner_link_article_item\"><s...\n",
       "6   <div><label class=\"inner_link_article_item\"><s...\n",
       "7   <div><label class=\"inner_link_article_item\"><s...\n",
       "8   <div><label class=\"inner_link_article_item\"><s...\n",
       "9   <div><label class=\"inner_link_article_item\"><s...\n",
       "10  <div><label class=\"inner_link_article_item\"><s...\n",
       "11  <div><label class=\"inner_link_article_item\"><s...\n",
       "12  <div><label class=\"inner_link_article_item\"><s...\n",
       "13  <div><label class=\"inner_link_article_item\"><s...\n",
       "14  <div><label class=\"inner_link_article_item\"><s...\n",
       "15  <div><label class=\"inner_link_article_item\"><s...\n",
       "16  <div><label class=\"inner_link_article_item\"><s...\n",
       "17  <div><label class=\"inner_link_article_item\"><s...\n",
       "18  <div><label class=\"inner_link_article_item\"><s...\n",
       "19  <div><label class=\"inner_link_article_item\"><s...\n",
       "20  <div><label class=\"inner_link_article_item\"><s...\n",
       "21  <div><label class=\"inner_link_article_item\"><s...\n",
       "22  <div><label class=\"inner_link_article_item\"><s...\n",
       "23  <div><label class=\"inner_link_article_item\"><s...\n",
       "24  <div><label class=\"inner_link_article_item\"><s...\n",
       "25  <div><label class=\"inner_link_article_item\"><s...\n",
       "26  <div><label class=\"inner_link_article_item\"><s...\n",
       "27  <div><label class=\"inner_link_article_item\"><s...\n",
       "28  <div><label class=\"inner_link_article_item\"><s...\n",
       "29  <div><label class=\"inner_link_article_item\"><s...\n",
       "30  <div><label class=\"inner_link_article_item\"><s...\n",
       "31  <div><label class=\"inner_link_article_item\"><s...\n",
       "32  <div><label class=\"inner_link_article_item\"><s...\n",
       "33  <div><label class=\"inner_link_article_item\"><s...\n",
       "34  <div><label class=\"inner_link_article_item\"><s...\n",
       "35  <div><label class=\"inner_link_article_item\"><s...\n",
       "36  <div><label class=\"inner_link_article_item\"><s...\n",
       "37  <div><label class=\"inner_link_article_item\"><s...\n",
       "38  <div><label class=\"inner_link_article_item\"><s...\n",
       "39  <div><label class=\"inner_link_article_item\"><s...\n",
       "40  <div><label class=\"inner_link_article_item\"><s...\n",
       "41  <div><label class=\"inner_link_article_item\"><s...\n",
       "42  <div><label class=\"inner_link_article_item\"><s...\n",
       "43  <div><label class=\"inner_link_article_item\"><s...\n",
       "44  <div><label class=\"inner_link_article_item\"><s...\n",
       "45  <div><label class=\"inner_link_article_item\"><s...\n",
       "46  <div><label class=\"inner_link_article_item\"><s...\n",
       "47  <div><label class=\"inner_link_article_item\"><s...\n",
       "48  <div><label class=\"inner_link_article_item\"><s...\n",
       "49  <div><label class=\"inner_link_article_item\"><s...\n",
       "50  <div><label class=\"inner_link_article_item\"><s...\n",
       "51  <div><label class=\"inner_link_article_item\"><s...\n",
       "52  <div><label class=\"inner_link_article_item\"><s...\n",
       "53  <div><label class=\"inner_link_article_item\"><s...\n",
       "54  <div><label class=\"inner_link_article_item\"><s...\n",
       "55  <div><label class=\"inner_link_article_item\"><s...\n",
       "56  <div><label class=\"inner_link_article_item\"><s..."
      ]
     },
     "execution_count": 170,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = pd.DataFrame([html_raw]).T\n",
    "df.columns = [\"html_snippets\"]\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 171,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Stored 'html_raw' (dict)\n"
     ]
    }
   ],
   "source": [
    "%store html_raw\n",
    "import pickle \n",
    "filehandler = open(\"html_raw\", 'wb') \n",
    "pickle.dump(html_raw, filehandler)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 172,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "56\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>html_snippets</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: [html_snippets]\n",
       "Index: []"
      ]
     },
     "execution_count": 172,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_out = df[~df.duplicated()]\n",
    "print (len(df_out))\n",
    "df[df.duplicated()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 173,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[]"
      ]
     },
     "execution_count": 173,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "try_again = list(df[df.duplicated()].index)\n",
    "print(try_again)\n",
    "try_again = try_again + list (set(pages).difference(set(df.index.values)))\n",
    "try_again"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 暂存档"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 174,
   "metadata": {},
   "outputs": [],
   "source": [
    "filename = fn [\"output\"] [\"公众号_htm_snippets\"] \n",
    "df_out.to_csv(filename.format(公众号=公众号), sep=\"\\t\", encoding=\"utf8\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 175,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "14,12,11,11,10,10,10,10,10,10,10,10,10,10,10,13,15,15,15,15,18,16,15,15,15,20,21,19,17,15,16,19,16,20,18,18,23,19,25,21,19,15,15,13,15,14,15,16,7,5,5,5,5,5,12,9,"
     ]
    }
   ],
   "source": [
    "def parse_html_snippets(_snippet_):\n",
    "    root = fromstring(_snippet_) \n",
    "    title = [x.text for x in root.xpath('//div[@class=\"inner_link_article_title\"]')]\n",
    "    create_time = [x.text for x in root.xpath('//div[@class=\"inner_link_article_date\"]')]\n",
    "    link = [x for x in root.xpath('//a/@href')]\n",
    "    _df_ = pd.DataFrame({\"title\":title, \"create_time\": create_time, \"link\":link})\n",
    "    return(_df_)\n",
    "    \n",
    "l_df = []\n",
    "for p in pages:\n",
    "    _df_ = parse_html_snippets(df.loc[p,\"html_snippets\"])\n",
    "    print (len(_df_), end=\",\")\n",
    "    l_df.append(_df_)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 176,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>create_time</th>\n",
       "      <th>link</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>想不想在家躺着赚点零花钱！我这有份好工作！很适合你啊！</td>\n",
       "      <td>2019-12-24</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>超准！来测测你的2020年的运势！</td>\n",
       "      <td>2019-12-24</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>日薪300！每个月还额外送你100元奖励！</td>\n",
       "      <td>2019-12-24</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>想不想在家躺着赚点零花钱！我这有份好工作！很适合你啊！</td>\n",
       "      <td>2019-12-23</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>超准！来测测你的2020年的运势！</td>\n",
       "      <td>2019-12-23</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>日薪300！每个月还额外送你100元奖励！</td>\n",
       "      <td>2019-12-23</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>过完双12还有圣诞节！有钱的秘密都在这！</td>\n",
       "      <td>2019-12-22</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>超准！来测测你的2020年的运势！</td>\n",
       "      <td>2019-12-22</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>日薪300！每个月还额外送你100元奖励！</td>\n",
       "      <td>2019-12-22</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>周末在家，还能躺着赚点零花钱！日薪百元！要不要试试！</td>\n",
       "      <td>2019-12-21</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>超准！来测测你的2020年的运势！</td>\n",
       "      <td>2019-12-21</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                          title create_time  \\\n",
       "0   想不想在家躺着赚点零花钱！我这有份好工作！很适合你啊！  2019-12-24   \n",
       "1             超准！来测测你的2020年的运势！  2019-12-24   \n",
       "2         日薪300！每个月还额外送你100元奖励！  2019-12-24   \n",
       "3   想不想在家躺着赚点零花钱！我这有份好工作！很适合你啊！  2019-12-23   \n",
       "4             超准！来测测你的2020年的运势！  2019-12-23   \n",
       "5         日薪300！每个月还额外送你100元奖励！  2019-12-23   \n",
       "6          过完双12还有圣诞节！有钱的秘密都在这！  2019-12-22   \n",
       "7             超准！来测测你的2020年的运势！  2019-12-22   \n",
       "8         日薪300！每个月还额外送你100元奖励！  2019-12-22   \n",
       "9    周末在家，还能躺着赚点零花钱！日薪百元！要不要试试！  2019-12-21   \n",
       "10            超准！来测测你的2020年的运势！  2019-12-21   \n",
       "\n",
       "                                                 link  \n",
       "0   http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...  \n",
       "1   http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...  \n",
       "2   http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...  \n",
       "3   http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...  \n",
       "4   http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...  \n",
       "5   http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...  \n",
       "6   http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...  \n",
       "7   http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...  \n",
       "8   http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...  \n",
       "9   http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...  \n",
       "10  http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...  "
      ]
     },
     "execution_count": 176,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_url_out = pd.concat(l_df).reset_index(drop=True)\n",
    "df_url_out.loc[0:10]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 179,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>create_time</th>\n",
       "      <th>link</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>765</th>\n",
       "      <td>错误词语挑选员6元/次 新的一周也要努力工作鸭！</td>\n",
       "      <td>2018-12-10</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>766</th>\n",
       "      <td>提前制定新一年计划</td>\n",
       "      <td>2018-12-10</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>767</th>\n",
       "      <td>电话叫早员100元/次 冬天再冷也挡不住我们赚钱的热情！</td>\n",
       "      <td>2018-12-08</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>768</th>\n",
       "      <td>冬季的理想生活</td>\n",
       "      <td>2018-12-08</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>769</th>\n",
       "      <td>阿迪耐克工厂招兼职 没钱买新衣？不，我不允许你们这样！</td>\n",
       "      <td>2018-12-07</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>770</th>\n",
       "      <td>这是我见过的最厉害的帽子戏法</td>\n",
       "      <td>2018-12-07</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>771</th>\n",
       "      <td>开学！不出宿舍，日赚百元！</td>\n",
       "      <td>2018-09-03</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                            title create_time  \\\n",
       "765      错误词语挑选员6元/次 新的一周也要努力工作鸭！  2018-12-10   \n",
       "766                     提前制定新一年计划  2018-12-10   \n",
       "767  电话叫早员100元/次 冬天再冷也挡不住我们赚钱的热情！  2018-12-08   \n",
       "768                       冬季的理想生活  2018-12-08   \n",
       "769   阿迪耐克工厂招兼职 没钱买新衣？不，我不允许你们这样！  2018-12-07   \n",
       "770                这是我见过的最厉害的帽子戏法  2018-12-07   \n",
       "771                 开学！不出宿舍，日赚百元！  2018-09-03   \n",
       "\n",
       "                                                  link  \n",
       "765  http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...  \n",
       "766  http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...  \n",
       "767  http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...  \n",
       "768  http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...  \n",
       "769  http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...  \n",
       "770  http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...  \n",
       "771  http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...  "
      ]
     },
     "execution_count": 179,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_url_out.tail(7)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 180,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>create_time</th>\n",
       "      <th>link</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>value</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>想不想在家躺着赚点零花钱！我这有份好工作！很适合你啊！</td>\n",
       "      <td>2019-12-24</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>超准！来测测你的2020年的运势！</td>\n",
       "      <td>2019-12-24</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>日薪300！每个月还额外送你100元奖励！</td>\n",
       "      <td>2019-12-24</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>想不想在家躺着赚点零花钱！我这有份好工作！很适合你啊！</td>\n",
       "      <td>2019-12-23</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>超准！来测测你的2020年的运势！</td>\n",
       "      <td>2019-12-23</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>767</th>\n",
       "      <td>电话叫早员100元/次 冬天再冷也挡不住我们赚钱的热情！</td>\n",
       "      <td>2018-12-08</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>768</th>\n",
       "      <td>冬季的理想生活</td>\n",
       "      <td>2018-12-08</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>769</th>\n",
       "      <td>阿迪耐克工厂招兼职 没钱买新衣？不，我不允许你们这样！</td>\n",
       "      <td>2018-12-07</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>770</th>\n",
       "      <td>这是我见过的最厉害的帽子戏法</td>\n",
       "      <td>2018-12-07</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>771</th>\n",
       "      <td>开学！不出宿舍，日赚百元！</td>\n",
       "      <td>2018-09-03</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>617 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                              title create_time  \\\n",
       "value                                             \n",
       "0       想不想在家躺着赚点零花钱！我这有份好工作！很适合你啊！  2019-12-24   \n",
       "1                 超准！来测测你的2020年的运势！  2019-12-24   \n",
       "2             日薪300！每个月还额外送你100元奖励！  2019-12-24   \n",
       "3       想不想在家躺着赚点零花钱！我这有份好工作！很适合你啊！  2019-12-23   \n",
       "4                 超准！来测测你的2020年的运势！  2019-12-23   \n",
       "...                             ...         ...   \n",
       "767    电话叫早员100元/次 冬天再冷也挡不住我们赚钱的热情！  2018-12-08   \n",
       "768                         冬季的理想生活  2018-12-08   \n",
       "769     阿迪耐克工厂招兼职 没钱买新衣？不，我不允许你们这样！  2018-12-07   \n",
       "770                  这是我见过的最厉害的帽子戏法  2018-12-07   \n",
       "771                   开学！不出宿舍，日赚百元！  2018-09-03   \n",
       "\n",
       "                                                    link  \n",
       "value                                                     \n",
       "0      http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...  \n",
       "1      http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...  \n",
       "2      http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...  \n",
       "3      http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...  \n",
       "4      http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...  \n",
       "...                                                  ...  \n",
       "767    http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...  \n",
       "768    http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...  \n",
       "769    http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...  \n",
       "770    http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...  \n",
       "771    http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...  \n",
       "\n",
       "[617 rows x 3 columns]"
      ]
     },
     "execution_count": 180,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# tagging 标记\n",
    "tagging_list = [\"\",\"马化腾\", \"腾讯\", \"微众银行\",\"腾讯复星\",\"腾讯风控\",\"腾讯支付\",\"WeChat\",\"We Remit\",\\\n",
    "                \"公益日\",\"红包\",\\\n",
    "                \"腾讯财付通\",\"鹅厂\",\"QQ钱包\",\"QQ红包\",\"QQ\",\\\n",
    "                \"只有一\",\"大咖\",\"听说\",\"图片\",\"照片\",\"小编\",\\\n",
    "                \"洗钱\", \"黑产\",\"被骗\",\"腾讯安全课\",\"诈骗\", \"炒股\",\"神秘兼职\",\"神秘组织\",\\\n",
    "                \"财付通\", \"品牌\",\\\n",
    "                \"收款\",\\\n",
    "                \"报告\",\\\n",
    "                \"银行卡\",\"理财\",\"选股\",\"发票\",\"基金\",\\\n",
    "                \"区块链\",\"金融云\",\"O2O\",\"农产品\",\"家乡\",\\\n",
    "                \"数据\", \"数据赋能\", \"智能\", \"数字孪生\", \"智慧大脑\",\\\n",
    "                \"出行\",\"乘车\",\"公交\",\"乘车码\", \"智慧地铁\",\\\n",
    "                \"高峰论坛\", \"智库\",\\\n",
    "                \"央行\",\"新规\", \\\n",
    "                \"微信\", \"微信支付\", \"跨境支付\", \"移动支付\",\"非银行支付\",\"电子支付\",\\\n",
    "                \"互联网金融\", \"金融科技\",\"互联网＋\",\"互联网+金融\",\"普惠金融\",\"虚拟银行\",\\\n",
    "                \"开放\",\"生态\",\"复杂\",\"互联网思维\",\"全球合作伙伴\",\\\n",
    "                \"联合国\", \"城市\", \"粤港澳大湾区\", \"平台\", \"可持续发展\", \"未来\", \"绿色\",\\\n",
    "                \"医护\",\"防护服\",\"小时\",\"武汉\",\"危机\",\"新冠肺炎\", \"疫\", \"疫情\", \"复工\",\"停课\",\"宅经济\",\\\n",
    "                \"基建\",\"新基建\"] #overwritable\n",
    "\n",
    "v_v_list = []\n",
    "\n",
    "for tag in tagging_list:\n",
    "    index_list = df_url_out [ df_url_out.title.str.contains(tag) ].index.tolist()\n",
    "    v_v_pairs = pd.DataFrame({tag:index_list}).melt().set_index(\"value\")\n",
    "    v_v_list.append(v_v_pairs)\n",
    "\n",
    "df_cat = v_v_list[0]\n",
    "for d in v_v_list:\n",
    "    df_cat.update(d)\n",
    "    \n",
    "# 尚未标记内容\n",
    "df_url_out.loc [ df_cat.query('variable==\"\"').index ]"
   ]
  },
  {
   "cell_type": "raw",
   "metadata": {},
   "source": [
    "df_url_out.loc[53].link"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 181,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>create_time</th>\n",
       "      <th>link</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "Empty DataFrame\n",
       "Columns: [title, create_time, link]\n",
       "Index: []"
      ]
     },
     "execution_count": 181,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_url_out[df_url_out.duplicated()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 182,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>create_time</th>\n",
       "      <th>link</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>想不想在家躺着赚点零花钱！我这有份好工作！很适合你啊！</td>\n",
       "      <td>2019-12-24</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>超准！来测测你的2020年的运势！</td>\n",
       "      <td>2019-12-24</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>日薪300！每个月还额外送你100元奖励！</td>\n",
       "      <td>2019-12-24</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>想不想在家躺着赚点零花钱！我这有份好工作！很适合你啊！</td>\n",
       "      <td>2019-12-23</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>超准！来测测你的2020年的运势！</td>\n",
       "      <td>2019-12-23</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>767</th>\n",
       "      <td>电话叫早员100元/次 冬天再冷也挡不住我们赚钱的热情！</td>\n",
       "      <td>2018-12-08</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>768</th>\n",
       "      <td>冬季的理想生活</td>\n",
       "      <td>2018-12-08</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>769</th>\n",
       "      <td>阿迪耐克工厂招兼职 没钱买新衣？不，我不允许你们这样！</td>\n",
       "      <td>2018-12-07</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>770</th>\n",
       "      <td>这是我见过的最厉害的帽子戏法</td>\n",
       "      <td>2018-12-07</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>771</th>\n",
       "      <td>开学！不出宿舍，日赚百元！</td>\n",
       "      <td>2018-09-03</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>772 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                            title create_time  \\\n",
       "0     想不想在家躺着赚点零花钱！我这有份好工作！很适合你啊！  2019-12-24   \n",
       "1               超准！来测测你的2020年的运势！  2019-12-24   \n",
       "2           日薪300！每个月还额外送你100元奖励！  2019-12-24   \n",
       "3     想不想在家躺着赚点零花钱！我这有份好工作！很适合你啊！  2019-12-23   \n",
       "4               超准！来测测你的2020年的运势！  2019-12-23   \n",
       "..                            ...         ...   \n",
       "767  电话叫早员100元/次 冬天再冷也挡不住我们赚钱的热情！  2018-12-08   \n",
       "768                       冬季的理想生活  2018-12-08   \n",
       "769   阿迪耐克工厂招兼职 没钱买新衣？不，我不允许你们这样！  2018-12-07   \n",
       "770                这是我见过的最厉害的帽子戏法  2018-12-07   \n",
       "771                 开学！不出宿舍，日赚百元！  2018-09-03   \n",
       "\n",
       "                                                  link  \n",
       "0    http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...  \n",
       "1    http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...  \n",
       "2    http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...  \n",
       "3    http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...  \n",
       "4    http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...  \n",
       "..                                                 ...  \n",
       "767  http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...  \n",
       "768  http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...  \n",
       "769  http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...  \n",
       "770  http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...  \n",
       "771  http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...  \n",
       "\n",
       "[772 rows x 3 columns]"
      ]
     },
     "execution_count": 182,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Keep only the first occurrence of every distinct row.\n",
    "df_url_out.drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 183,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>create_time</th>\n",
       "      <th>link</th>\n",
       "      <th>variable</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>想不想在家躺着赚点零花钱！我这有份好工作！很适合你啊！</td>\n",
       "      <td>2019-12-24</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>超准！来测测你的2020年的运势！</td>\n",
       "      <td>2019-12-24</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>日薪300！每个月还额外送你100元奖励！</td>\n",
       "      <td>2019-12-24</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>想不想在家躺着赚点零花钱！我这有份好工作！很适合你啊！</td>\n",
       "      <td>2019-12-23</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>超准！来测测你的2020年的运势！</td>\n",
       "      <td>2019-12-23</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>767</th>\n",
       "      <td>电话叫早员100元/次 冬天再冷也挡不住我们赚钱的热情！</td>\n",
       "      <td>2018-12-08</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>768</th>\n",
       "      <td>冬季的理想生活</td>\n",
       "      <td>2018-12-08</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>769</th>\n",
       "      <td>阿迪耐克工厂招兼职 没钱买新衣？不，我不允许你们这样！</td>\n",
       "      <td>2018-12-07</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>770</th>\n",
       "      <td>这是我见过的最厉害的帽子戏法</td>\n",
       "      <td>2018-12-07</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>771</th>\n",
       "      <td>开学！不出宿舍，日赚百元！</td>\n",
       "      <td>2018-09-03</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>772 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                            title create_time  \\\n",
       "0     想不想在家躺着赚点零花钱！我这有份好工作！很适合你啊！  2019-12-24   \n",
       "1               超准！来测测你的2020年的运势！  2019-12-24   \n",
       "2           日薪300！每个月还额外送你100元奖励！  2019-12-24   \n",
       "3     想不想在家躺着赚点零花钱！我这有份好工作！很适合你啊！  2019-12-23   \n",
       "4               超准！来测测你的2020年的运势！  2019-12-23   \n",
       "..                            ...         ...   \n",
       "767  电话叫早员100元/次 冬天再冷也挡不住我们赚钱的热情！  2018-12-08   \n",
       "768                       冬季的理想生活  2018-12-08   \n",
       "769   阿迪耐克工厂招兼职 没钱买新衣？不，我不允许你们这样！  2018-12-07   \n",
       "770                这是我见过的最厉害的帽子戏法  2018-12-07   \n",
       "771                 开学！不出宿舍，日赚百元！  2018-09-03   \n",
       "\n",
       "                                                  link variable  \n",
       "0    http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...     无法分类  \n",
       "1    http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...     无法分类  \n",
       "2    http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...     无法分类  \n",
       "3    http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...     无法分类  \n",
       "4    http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...     无法分类  \n",
       "..                                                 ...      ...  \n",
       "767  http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...     无法分类  \n",
       "768  http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...     无法分类  \n",
       "769  http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...     无法分类  \n",
       "770  http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...     无法分类  \n",
       "771  http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...     无法分类  \n",
       "\n",
       "[772 rows x 4 columns]"
      ]
     },
     "execution_count": 183,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Attach df_cat's columns by index, then label blank or missing cells as \"无法分类\".\n",
    "df_o = (\n",
    "    df_url_out\n",
    "    .join(df_cat)\n",
    "    .replace(\"\", np.nan)\n",
    "    .fillna(\"无法分类\")\n",
    ")\n",
    "df_o"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 189,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "      <th>create_time</th>\n",
       "      <th>link</th>\n",
       "      <th>variable</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>想不想在家躺着赚点零花钱！我这有份好工作！很适合你啊！</td>\n",
       "      <td>2019-12-24</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>想不想在家躺着赚点零花钱！我这有份好工作！很适合你啊！</td>\n",
       "      <td>2019-12-23</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>周末在家，还能躺着赚点零花钱！日薪百元！要不要试试！</td>\n",
       "      <td>2019-12-21</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>马上就周末了！不打算在家赚点零花钱吗！</td>\n",
       "      <td>2019-12-20</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>132</th>\n",
       "      <td>今天！来赚点零花钱吗！</td>\n",
       "      <td>2019-10-24</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>140</th>\n",
       "      <td>推荐你一份手机好工作，赚点零花钱吧！</td>\n",
       "      <td>2019-10-20</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>144</th>\n",
       "      <td>推荐你一份工作，来赚点零花钱吧！</td>\n",
       "      <td>2019-10-18</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>146</th>\n",
       "      <td>推荐你一份工作，来赚点零花钱吧！</td>\n",
       "      <td>2019-10-17</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>428</th>\n",
       "      <td>这份工作，每天拿着手机不光能薅羊毛，还能挣点零花钱！</td>\n",
       "      <td>2019-07-19</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>432</th>\n",
       "      <td>这份工作，每天拿着手机不光能薅羊毛，还能挣点零花钱！</td>\n",
       "      <td>2019-07-18</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>436</th>\n",
       "      <td>这份工作，每天拿着手机不光能薅羊毛，还能挣点零花钱！</td>\n",
       "      <td>2019-07-17</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>440</th>\n",
       "      <td>这份工作，每天拿着手机不光能薅羊毛，还能挣点零花钱！</td>\n",
       "      <td>2019-07-16</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>724</th>\n",
       "      <td>周末在家无聊了？用手机赚点零花钱啊～</td>\n",
       "      <td>2019-04-20</td>\n",
       "      <td>http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...</td>\n",
       "      <td>无法分类</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                           title create_time  \\\n",
       "0    想不想在家躺着赚点零花钱！我这有份好工作！很适合你啊！  2019-12-24   \n",
       "3    想不想在家躺着赚点零花钱！我这有份好工作！很适合你啊！  2019-12-23   \n",
       "9     周末在家，还能躺着赚点零花钱！日薪百元！要不要试试！  2019-12-21   \n",
       "12           马上就周末了！不打算在家赚点零花钱吗！  2019-12-20   \n",
       "132                  今天！来赚点零花钱吗！  2019-10-24   \n",
       "140           推荐你一份手机好工作，赚点零花钱吧！  2019-10-20   \n",
       "144             推荐你一份工作，来赚点零花钱吧！  2019-10-18   \n",
       "146             推荐你一份工作，来赚点零花钱吧！  2019-10-17   \n",
       "428   这份工作，每天拿着手机不光能薅羊毛，还能挣点零花钱！  2019-07-19   \n",
       "432   这份工作，每天拿着手机不光能薅羊毛，还能挣点零花钱！  2019-07-18   \n",
       "436   这份工作，每天拿着手机不光能薅羊毛，还能挣点零花钱！  2019-07-17   \n",
       "440   这份工作，每天拿着手机不光能薅羊毛，还能挣点零花钱！  2019-07-16   \n",
       "724           周末在家无聊了？用手机赚点零花钱啊～  2019-04-20   \n",
       "\n",
       "                                                  link variable  \n",
       "0    http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...     无法分类  \n",
       "3    http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...     无法分类  \n",
       "9    http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...     无法分类  \n",
       "12   http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...     无法分类  \n",
       "132  http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...     无法分类  \n",
       "140  http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...     无法分类  \n",
       "144  http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...     无法分类  \n",
       "146  http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...     无法分类  \n",
       "428  http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...     无法分类  \n",
       "432  http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...     无法分类  \n",
       "436  http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...     无法分类  \n",
       "440  http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...     无法分类  \n",
       "724  http://mp.weixin.qq.com/s?__biz=MzUzNTkxOTQ2Nw...     无法分类  "
      ]
     },
     "execution_count": 189,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Spot-check: rows whose title mentions \"零花钱\".\n",
    "df_o.loc[df_o[\"title\"].str.contains(\"零花钱\")]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 190,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>title</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>variable</th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>无法分类</th>\n",
       "      <td>617</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>红包</th>\n",
       "      <td>71</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>微信</th>\n",
       "      <td>67</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>小时</th>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>听说</th>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>小编</th>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          title\n",
       "variable       \n",
       "无法分类        617\n",
       "红包           71\n",
       "微信           67\n",
       "小时           10\n",
       "听说            5\n",
       "小编            2"
      ]
     },
     "execution_count": 190,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Number of titles per `variable` value, most frequent first.\n",
    "df_stats = (\n",
    "    df_o\n",
    "    .groupby(\"variable\")[[\"title\"]]\n",
    "    .count()\n",
    "    .sort_values(\"title\", ascending=False)\n",
    ")\n",
    "df_stats"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 输出"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 191,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tag each result frame with the name later used as its Excel sheet name.\n",
    "for _frame, _sheet in (\n",
    "    (df_account, \"rel_accounts\"),\n",
    "    (df_o, \"url_cat\"),\n",
    "    (df_stats, \"stats\"),\n",
    "):\n",
    "    _frame.columns.name = _sheet"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 192,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sanity check: the `columns.name` of each frame that is about to be exported.\n",
    "# Uses the frames directly so the cell also works on a fresh kernel (Restart & Run All).\n",
    "[frame.columns.name for frame in (df_account, df_o, df_stats)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 193,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write every result frame to its own sheet of the output workbook.\n",
    "# Each sheet is named after the frame's `columns.name`, which must be set before this cell runs.\n",
    "with pd.ExcelWriter(fn[\"output\"][\"公众号_xlsx\"].format(公众号=公众号)) as writer:\n",
    "    for _df_ in [df_account, df_o, df_stats]:\n",
    "        _df_.to_excel(writer, sheet_name=_df_.columns.name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
